# The tidyverse is used because it bundles the libraries needed to clean,
# mutate, and visualize data. data.table is loaded because fread() reads the
# data faster than read.csv().
library(tidyverse)
library(data.table)
library(lubridate)
library(magrittr)
library(flexdashboard)
library(ggrepel)
library(formattable)
library(sf)
library(plotly)
library(treemapify)
library(scales)
library(RColorBrewer)
library(formatR)
library(caret)
library(rgdal)
# Load the trip data with data.table::fread().
taxi_data_join_4 <- fread("clean_data_haoguo.csv")

# Headline value boxes for the dashboard.
# NOTE(review): the three figures below are hard-coded literals, not values
# computed from taxi_data_join_4 -- confirm they still match the data.
# The lines starting with "##" record the text each box renders; they were
# bare (unparseable) output lines before and are kept as comments.
valueBox(scales::dollar(12.32), icon = "fas fa-taxi", color = "black")
## $12.32

# dollar_format() is used here purely as a prefix/suffix number formatter:
# the "$" prefix is replaced by a space and a unit is appended.
Withtext <- dollar_format(prefix = " ", suffix = " Miles")
valueBox(Withtext(2.87), icon = "fas fa-taxi", color = "black")
## 2.87 Miles

# Same formatter trick with a different unit; Withtext is deliberately
# rebound so the global name keeps its final (Minutes) definition.
Withtext <- dollar_format(prefix = " ", suffix = " Minutes")
valueBox(Withtext(35), icon = "fas fa-taxi", color = "black")
## 35 Minutes
# TODO: change to show average distance instead of trips this week.
# Bar chart of the number of rides per weekday. The discrete x limits fix the
# bar order to Monday..Sunday; rows whose weekday is not in the list are
# dropped from the plot.
ggplot(taxi_data_join_4, aes(x = weekday)) +
  geom_bar(fill = "#FFF0F5") +
  scale_x_discrete(
    limits = c(
      "Monday", "Tuesday", "Wednesday", "Thursday",
      "Friday", "Saturday", "Sunday"
    )
  ) +
  scale_y_continuous(labels = comma) +
  labs(x = "Weekday", y = "Number of Rides") +
  # Large text, no grid or panel fill, black axis lines, bordered white plot.
  theme(
    axis.text = element_text(size = 50),
    axis.title = element_text(size = 50),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.background = element_rect(fill = "white", colour = "black")
  )
# Bar chart of the number of rides for each value of `hour`.
ggplot(taxi_data_join_4, aes(x = hour)) +
  geom_bar(fill = "#FFF0F5") +
  # One tick per hour of the day. The breaks stop at 23: the previous
  # seq(0, 30) requested ticks past the last hour, and a break at 24 can fall
  # inside the axis padding and be drawn as a stray label.
  scale_x_continuous(breaks = seq(0, 23)) +
  scale_y_continuous(labels = comma) +
  labs(x = "Hour", y = "Number of Rides") +
  # Large text, no grid or panel fill, black axis lines, bordered white plot.
  theme(
    axis.text = element_text(size = 50),
    axis.title = element_text(size = 50),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.background = element_rect(fill = "white", colour = "black")
  )
# Horizontal bar chart of the number of rides per pick-up borough.
# The discrete limits both order the bars and restrict them to the five
# listed values; any other PU_borough value is left out of the plot.
ggplot(taxi_data_join_4, aes(x = PU_borough)) +
  geom_bar(fill = "#FFF0F5") +
  coord_flip() +
  scale_x_discrete(
    limits = c("EWR", "Bronx", "Queens", "Brooklyn", "Manhattan")
  ) +
  scale_y_continuous(labels = comma) +
  labs(x = "Top Pick-Up Locations", y = "Number of Rides") +
  # Large text, no grid or panel fill, black axis lines, bordered white plot.
  theme(
    axis.text = element_text(size = 50),
    axis.title = element_text(size = 50),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.background = element_rect(fill = "white", colour = "black")
  )
# Horizontal bar chart of the number of rides per drop-off borough.
# The discrete limits both order the bars and restrict them to the five
# listed values; any other DO_borough value is left out of the plot.
ggplot(taxi_data_join_4, aes(x = DO_borough)) +
  geom_bar(fill = "#FFF0F5") +
  coord_flip() +
  scale_x_discrete(
    limits = c("EWR", "Bronx", "Queens", "Brooklyn", "Manhattan")
  ) +
  scale_y_continuous(labels = comma) +
  labs(x = "Top Drop Off Locations", y = "Number of Rides") +
  # Large text, no grid or panel fill, black axis lines, bordered white plot.
  theme(
    axis.text = element_text(size = 50),
    axis.title = element_text(size = 50),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line = element_line(colour = "black"),
    plot.background = element_rect(fill = "white", colour = "black")
  )
# Trips whose pick-up borough is Manhattan. The column is referenced by bare
# name so the condition is evaluated on the piped data, not on the global
# table via `$`.
M <- taxi_data_join_4 %>%
  filter(PU_borough == "Manhattan")

library(treemap)

# Number of Manhattan pick-ups per weekday, ignoring rows with a missing
# weekday. The result is converted to a plain data.frame (columns `weekday`
# and `count`) before being handed to treemap().
count_weekday <- M %>%
  filter(!is.na(weekday)) %>%
  dplyr::count(weekday, name = "count") %>%
  as.data.frame()

# Treemap with one tile per weekday, sized by ride count.
tm <- treemap(count_weekday, index = "weekday", vSize = "count")
# Drop every row of M that has a missing value in any column. na.omit()
# returns a new object rather than modifying M, so the result has to be
# assigned back; the bare call only printed the cleaned table (the "##"
# preview that follows is that printout) and left M unchanged.
# NOTE(review): this can reduce the rows seen by later steps that use M --
# confirm that is the intended behavior.
M <- na.omit(M)
## lpep_pickup_datetime lpep_dropoff_datetime date PULocationID
## 1: 2019-04-17 08:58:00 2019-04-17 09:04:30 2019-04-17 41
## 2: 2019-04-17 12:36:53 2019-04-17 12:43:34 2019-04-17 41
## 3: 2019-04-17 12:17:55 2019-04-17 12:25:18 2019-04-17 41
## 4: 2019-04-17 15:49:50 2019-04-17 15:58:01 2019-04-17 41
## 5: 2019-04-18 13:22:23 2019-04-18 13:29:37 2019-04-18 41
## ---
## 2341379: 2020-12-09 15:22:43 2020-12-09 15:31:23 2020-12-09 244
## 2341380: 2020-12-11 10:55:38 2020-12-11 11:05:31 2020-12-11 244
## 2341381: 2020-12-16 14:35:14 2020-12-16 14:48:01 2020-12-16 244
## 2341382: 2020-12-16 14:03:48 2020-12-16 14:13:28 2020-12-16 244
## 2341383: 2020-12-22 15:51:20 2020-12-22 16:00:48 2020-12-22 244
## PU_borough PU_zone PU_service_zone DOLocationID
## 1: Manhattan Central Harlem Boro Zone 42
## 2: Manhattan Central Harlem Boro Zone 42
## 3: Manhattan Central Harlem Boro Zone 42
## 4: Manhattan Central Harlem Boro Zone 42
## 5: Manhattan Central Harlem Boro Zone 42
## ---
## 2341379: Manhattan Washington Heights South Boro Zone 220
## 2341380: Manhattan Washington Heights South Boro Zone 220
## 2341381: Manhattan Washington Heights South Boro Zone 220
## 2341382: Manhattan Washington Heights South Boro Zone 220
## 2341383: Manhattan Washington Heights South Boro Zone 220
## DO_borough DO_zone DO_service_zone
## 1: Manhattan Central Harlem North Boro Zone
## 2: Manhattan Central Harlem North Boro Zone
## 3: Manhattan Central Harlem North Boro Zone
## 4: Manhattan Central Harlem North Boro Zone
## 5: Manhattan Central Harlem North Boro Zone
## ---
## 2341379: Bronx Spuyten Duyvil/Kingsbridge Boro Zone
## 2341380: Bronx Spuyten Duyvil/Kingsbridge Boro Zone
## 2341381: Bronx Spuyten Duyvil/Kingsbridge Boro Zone
## 2341382: Bronx Spuyten Duyvil/Kingsbridge Boro Zone
## 2341383: Bronx Spuyten Duyvil/Kingsbridge Boro Zone
## store_and_fwd_flag passenger_count trip_distance fare_amount extra
## 1: N 1 1.20 7.0 0
## 2: N 1 1.12 7.0 0
## 3: N 1 1.28 7.0 0
## 4: N 1 1.03 7.0 0
## 5: N 1 1.35 7.0 0
## ---
## 2341379: N 1 3.65 12.0 0
## 2341380: N 1 4.06 13.5 0
## 2341381: N 1 3.27 13.0 0
## 2341382: N 1 3.87 13.0 0
## 2341383: N 1 3.50 13.0 0
## mta_tax tip_amount tolls_amount improvement_surcharge total_amount
## 1: 0.5 0 0.0 0.3 7.8
## 2: 0.5 0 0.0 0.3 7.8
## 3: 0.5 0 0.0 0.3 7.8
## 4: 0.5 0 0.0 0.3 7.8
## 5: 0.5 0 0.0 0.3 7.8
## ---
## 2341379: 0.5 0 2.8 0.3 15.6
## 2341380: 0.5 0 2.8 0.3 17.1
## 2341381: 0.5 0 2.8 0.3 16.6
## 2341382: 0.5 0 2.8 0.3 16.6
## 2341383: 0.5 0 2.8 0.3 16.6
## payment_type trip_type congestion_surcharge trip_hours weekday dow
## 1: 2 1 0 0.1083333 Wednesday 4
## 2: 2 1 0 0.1113889 Wednesday 4
## 3: 2 1 0 0.1230556 Wednesday 4
## 4: 2 1 0 0.1363889 Wednesday 4
## 5: 2 1 0 0.1205556 Thursday 5
## ---
## 2341379: 2 1 0 0.1444444 Wednesday 4
## 2341380: 2 1 0 0.1647222 Friday 6
## 2341381: 2 1 0 0.2130556 Wednesday 4
## 2341382: 2 1 0 0.1611111 Wednesday 4
## 2341383: 2 1 0 0.1577778 Tuesday 3
## month_day year month_name hour
## 1: 17 2019 April 8
## 2: 17 2019 April 12
## 3: 17 2019 April 12
## 4: 17 2019 April 15
## 5: 18 2019 April 13
## ---
## 2341379: 9 2020 December 15
## 2341380: 11 2020 December 10
## 2341381: 16 2020 December 14
## 2341382: 16 2020 December 14
## 2341383: 22 2020 December 15
# Linear model of fare amount on trip distance and trip duration, fitted on M.
lm_1 <- lm(fare_amount ~ trip_distance + trip_hours, data = M)

# Variable importance from caret. The predictor names arrive as row names, so
# they are moved into a `group` column next to the importance `value`.
a <- varImp(lm_1, scale = TRUE)
a <- setNames(
  cbind(rownames(a), a, row.names = NULL),
  c("group", "value")
)

# Share of total importance per predictor, as a percentage (this was scaled by
# 1000 before; the pie geometry is unaffected because only relative sizes
# matter). `ypos` is the midpoint of each slice, used to place its label.
a <- a %>%
  arrange(desc(group)) %>%
  mutate(
    prop = value / sum(value) * 100,
    ypos = cumsum(prop) - 0.5 * prop
  )

# Pie chart: a single stacked bar wrapped into polar coordinates, with each
# slice labelled by its predictor name.
ggplot(a, aes(x = "", y = prop, fill = group)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void() +
  theme(legend.position = "none") +
  geom_text(aes(y = ypos, label = group), color = "white", size = 6) +
  scale_fill_brewer(palette = "Set1")
# Read the taxi-zone layer and draw it with the default plot method.
my_spdf2 <- read_sf("taxi_zones_map.json")
plot(my_spdf2)

# Trips with month_day 21, dow 1 and hour 12.
m <- taxi_data_join_4 %>%
  filter(
    month_day == "21",
    dow == "1",
    hour == "12"
  )

# Rename the pick-up zone column so it matches the join key used below.
setnames(m, "PU_zone", "zone")

# Attach the zone geometry to each selected trip.
# NOTE(review): this yields one row (and one polygon) per trip, so zones with
# several trips are drawn repeatedly -- consider aggregating per zone first.
M3 <- merge(m, my_spdf2, by = "zone")

library(ggplot2)

# Map of the joined rows, filled by total_amount on a pink-to-red gradient.
ggplot(M3) +
  geom_sf(aes(fill = total_amount, geometry = geometry)) +
  scale_fill_gradient(low = "#f1b8f1", high = "#ef5767")